library(feather)
library(foreach)
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:plyr':
## 
##     here
## The following object is masked from 'package:base':
## 
##     date
library(imputeTS)
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
##   method             from    
##   fitted.fracdiff    fracdiff
##   residuals.fracdiff fracdiff
library(tictoc)
input_folder = "../../Seasonality-Restricted-Access-Repo/Data/Clue_subset/input/"
input_folder = "../../Seasonality-Restricted-Access-Repo/Data/Clue/input/"

tmp_folder = "../../Seasonality-Restricted-Access-Repo/Data/Clue/tmp/"
users = read_feather(path = paste0(input_folder, "users.feather"))
bc = read_feather(path = paste0(input_folder, "birth_control.feather"))


ggplot(users[users$country_area %in% names(sort(table(users$country_area),decreasing = TRUE))[1:10],], 
       aes(x = latest_birth_control, fill = country_area))+
  geom_bar()+
  facet_grid(country_area ~ .)

countries = c("Brazil - Central-West","Brazil - Northeast","United Kingdom","France","Germany","Mexico","United States - Midwest","United States - California","Canada","Australia")

birth_controls = c("pill","none","condoms")

j = which((users$country_area %in% countries) & (users$latest_birth_control %in% birth_controls))

ggplot(users[j,], 
       aes(x = latest_birth_control, fill = country_area))+
  geom_bar(position = "dodge")

users = users[j,]
tracking_files = list.files(paste0(input_folder,"tracking/")) 
tmp_folder_for_tracking_with_BC_and_country = paste0(tmp_folder,"tracking_with_BC_and_country/")
if(!dir.exists(tmp_folder_for_tracking_with_BC_and_country)){dir.create(tmp_folder_for_tracking_with_BC_and_country)}
tmp_folder_for_tracking_selected_users = paste0(tmp_folder,"tracking_selected_users/")
if(!dir.exists(tmp_folder_for_tracking_selected_users)){dir.create(tmp_folder_for_tracking_selected_users)}

tic()
sex_pop_time_series = foreach(tracking_file  = tracking_files, .combine = rbind) %do% {
  cat(tracking_file,"\n")
  
  tracking = read_feather(path = paste0(input_folder,"tracking/",tracking_file))
  m = match(tracking$user_id, users$user_id)
  tracking$country_area = users$country_area[m]
  tracking$latest_birth_control = users$latest_birth_control[m]
  tracking$file = tracking_file
  
  write_feather(tracking, path = paste0(tmp_folder_for_tracking_with_BC_and_country, tracking_file) )
  
  j = which((tracking$user_id %in% users$user_id) & 
              (tracking$category  %in% c("ANY","sex")))
  tracking = tracking[j,]
  
  write_feather(tracking, path = paste0(tmp_folder_for_tracking_selected_users, tracking_file) )

  sex_pop_time_series =  
    ddply(tracking,
          .(date,country_area,latest_birth_control,category,type),
          .fun = summarise,
          n = length(unique(user_id)))
  
  return(sex_pop_time_series)
}
## tracking0000.feather 
## tracking0001.feather 
## tracking0002.feather 
## tracking0003.feather 
## tracking0004.feather 
## tracking0005.feather 
## tracking0006.feather 
## tracking0007.feather 
## tracking0008.feather 
## tracking0009.feather 
## tracking0010.feather 
## tracking0011.feather 
## tracking0012.feather 
## tracking0013.feather 
## tracking0014.feather 
## tracking0015.feather 
## tracking0016.feather 
## tracking0017.feather 
## tracking0018.feather 
## tracking0019.feather
toc()
## 3209.89 sec elapsed
dim(sex_pop_time_series)
## [1] 2706381       6
write_feather(sex_pop_time_series, path = paste0(tmp_folder, "sex_pop_time_series.feather"))
ggplot(sex_pop_time_series, aes(x = date, y = n, col = type))+
  geom_line()+
  facet_grid(country_area ~ latest_birth_control)

sex_pop_ts = sex_pop_time_series[sex_pop_time_series$category == "sex",]
sex_pop_ts$key = paste0(sex_pop_ts$date, sex_pop_ts$country_area, sex_pop_ts$latest_birth_control)

any_pop_ts = sex_pop_time_series[sex_pop_time_series$category == "ANY",]
any_pop_ts$key = paste0(any_pop_ts$date, any_pop_ts$country_area, any_pop_ts$latest_birth_control)

sex_pop_ts$any = any_pop_ts$n[match(sex_pop_ts$key, any_pop_ts$key)]

sex_pop_ts$norm_n = sex_pop_ts$n / sex_pop_ts$any


ggplot(sex_pop_ts, aes(x = date, y = norm_n, col = type))+
  geom_line()+
  facet_grid(country_area ~ latest_birth_control)

j = which((sex_pop_ts$date > "2017-06-30") )

ggplot(sex_pop_ts[j,], aes(x = date, y = norm_n, col = type))+
  geom_line()+
  facet_grid(latest_birth_control ~ country_area)

j = which((sex_pop_ts$date > "2017-06-30") & 
            (sex_pop_ts$type %in% c("unprotected_sex","protected_sex")))

ggplot(sex_pop_ts[j,], aes(x = date, y = norm_n, col = latest_birth_control))+
  geom_line()+
  facet_grid(type ~ country_area)

sex_pop_ts_sum = aggregate(n ~ date + country_area + latest_birth_control + key, sex_pop_ts, sum)
sex_pop_ts_sum$any = any_pop_ts$n[match(sex_pop_ts_sum$key, any_pop_ts$key)]

sex_pop_ts_sum$norm_n = sex_pop_ts_sum$n / sex_pop_ts_sum$any


j = which((sex_pop_ts_sum$date > "2017-06-30") & (sex_pop_ts_sum$latest_birth_control %in% c("condoms","none","pill")))

ggplot(sex_pop_ts_sum[j,], aes(x = date, y = norm_n))+
  geom_line()+
  facet_grid(latest_birth_control ~ country_area)

sex_pop_ts_sum$weekday = wday(sex_pop_ts_sum$date)

sex_pop_ts_sum_wd = ddply(sex_pop_ts_sum,
                          .(weekday, country_area, latest_birth_control),
                          summarise,
                          n = sum(n),
                          any = sum(any))
sex_pop_ts_sum_wd$norm_n = sex_pop_ts_sum_wd$n / sex_pop_ts_sum_wd$any

j = which(sex_pop_ts_sum_wd$latest_birth_control %in% c("condoms","none","pill"))
ggplot(sex_pop_ts_sum_wd[j,], aes(x = weekday, y = norm_n, col = latest_birth_control) )+
  geom_line()+
  facet_grid(country_area ~ ., scale = "free")

sex_pop_ts_sum$month = month(sex_pop_ts_sum$date)

sex_pop_ts_sum_m = ddply(sex_pop_ts_sum,
                         .(month, country_area, latest_birth_control),
                         summarise,
                         n = sum(n),
                         any = sum(any))
sex_pop_ts_sum_m$norm_n = sex_pop_ts_sum_m$n / sex_pop_ts_sum_m$any

j = which(sex_pop_ts_sum_m$latest_birth_control %in% c("condoms","none","pill"))
ggplot(sex_pop_ts_sum_m[j,], aes(x = month, y = norm_n, col = latest_birth_control) )+
  geom_line()+
  facet_grid(country_area ~ ., scale = "free")+
  scale_x_continuous(breaks = 1:12)

j = which(sex_pop_ts_sum_m$latest_birth_control %in% c("condoms","none"))
ggplot(sex_pop_ts_sum_m[j,], aes(x = month, y = norm_n, col = latest_birth_control) )+
  geom_line()+
  facet_grid(country_area ~ ., scale = "free")+
  scale_x_continuous(breaks = 1:12)

sex_pop_ts_sum_m$country_area = factor(sex_pop_ts_sum_m$country_area, levels = countries)

j = which(sex_pop_ts_sum_m$latest_birth_control %in% c("condoms"))
ggplot(sex_pop_ts_sum_m[j,], aes(x = month, y = norm_n, col = country_area) )+
  geom_line()+  
  facet_grid(country_area ~ ., scale = "free")+
  scale_x_continuous(breaks = 1:12)

ok = foreach(country = unique(sex_pop_ts_sum$country_area)) %do% {
  
  cat(country, "\n")
  
  sub = sex_pop_ts_sum[which((sex_pop_ts_sum$country_area == country) & 
                               (sex_pop_ts_sum$latest_birth_control == "condoms") &
                               (sex_pop_ts_sum$date %in% seq(as.Date("2017-07-01"),as.Date("2019-06-30"),by = 1))),]
  t = seq(as.Date("2017-07-01"),as.Date("2019-06-30"),by = 1)
  x = sub$norm_n[match(t, sub$date)]
  x = na_interpolation(x)
  
  plot(sub$date, sub$norm_n, type = "l")
  
  plot(t, x, type = "l")
  
  tx = data.frame(t = 1:length(t), x = x)
  
  trend_model = loess(x ~ t, tx, degree = 2, span = 1)
  y = predict(trend_model, data.frame(t = tx$t))
  
  plot(tx$t, x, type = "l")
  points(tx$t, y, type = "l", col = "red")
  
  tx$trend = y
  tx$x_no_trend = tx$x - tx$trend
  
  signal_ts = ts(tx$x_no_trend, frequency = 7) 
  stl_w = stl(signal_ts, s.window = "periodic")
  plot(stl_w)
  
  tx$weekly = stl_w$time.series[,1]
  tx$x_no_trend_no_weekly = tx$x_no_trend - tx$weekly
  
  
  o = order(tx$x_no_trend_no_weekly, decreasing = TRUE)
  cat(as.character(sort(t[o[1:20]])),"\n")
  
  plot(t, tx$x_no_trend_no_weekly, type = "l")
  points(t[o[1:20]], tx$x_no_trend_no_weekly[o[1:20]], col = "red")
  
}
## Australia

## 2017-07-15 2017-08-04 2017-09-06 2017-10-28 2018-01-11 2018-01-22 2018-01-29 2018-02-26 2018-04-17 2018-05-01 2018-06-09 2018-09-30 2018-10-01 2018-10-21 2018-11-18 2019-01-12 2019-01-28 2019-03-10 2019-04-04 2019-04-25

## Brazil - Central-West

## 2017-07-07 2017-07-19 2017-09-02 2017-10-12 2018-01-01 2018-02-12 2018-02-13 2018-04-15 2018-04-29 2018-05-01 2018-05-31 2018-09-07 2018-10-12 2018-11-15 2019-01-01 2019-03-04 2019-03-31 2019-04-13 2019-05-01 2019-06-01

## Brazil - Northeast

## 2017-07-15 2017-07-25 2017-07-27 2017-09-04 2017-09-07 2017-09-15 2017-09-25 2017-10-07 2017-11-08 2017-11-14 2017-11-17 2017-11-24 2018-01-13 2018-01-21 2018-06-24 2018-07-22 2018-08-03 2018-09-21 2018-10-01 2019-06-12

## Canada

## 2017-07-16 2017-07-18 2017-08-05 2017-08-20 2017-10-04 2017-10-09 2017-10-15 2017-11-08 2018-01-23 2018-04-28 2018-06-02 2018-09-03 2018-09-09 2018-10-17 2018-10-19 2019-01-01 2019-02-02 2019-02-19 2019-03-31 2019-05-20

## France

## 2017-07-07 2017-07-30 2017-08-01 2017-08-14 2017-08-16 2017-08-21 2017-09-16 2017-09-17 2017-10-22 2018-01-01 2018-01-20 2018-04-07 2018-04-28 2018-05-01 2018-05-08 2018-05-20 2018-10-29 2019-01-01 2019-05-01 2019-05-30

## Germany

## 2017-07-27 2017-08-28 2017-10-03 2017-12-26 2017-12-27 2018-01-01 2018-03-10 2018-04-02 2018-04-07 2018-05-01 2018-05-10 2018-05-21 2018-10-03 2018-12-31 2019-01-01 2019-03-02 2019-04-28 2019-05-01 2019-05-30 2019-06-10

## Mexico

## 2017-07-05 2017-07-17 2018-01-01 2018-01-09 2018-01-19 2018-02-04 2018-02-14 2018-02-17 2018-02-23 2018-04-02 2018-04-21 2018-04-30 2018-05-01 2018-09-01 2018-09-22 2019-01-01 2019-02-04 2019-02-14 2019-03-17 2019-03-18

## United Kingdom

## 2017-07-03 2017-07-24 2017-10-14 2017-10-22 2017-12-27 2017-12-28 2018-01-01 2018-01-02 2018-01-14 2018-02-14 2018-03-17 2018-04-10 2018-05-07 2018-12-31 2019-01-01 2019-01-27 2019-02-14 2019-03-09 2019-05-06 2019-05-27

## United States - California

## 2017-07-02 2017-07-17 2017-07-24 2017-08-05 2017-08-08 2017-09-05 2017-09-28 2017-11-17 2017-11-22 2017-12-15 2018-01-01 2018-01-17 2018-03-24 2018-04-29 2018-06-09 2018-06-20 2018-07-23 2018-08-12 2019-01-01 2019-01-18

## United States - Midwest

## 2017-07-17 2017-07-22 2017-08-09 2017-08-14 2017-09-06 2017-09-11 2017-09-16 2017-09-24 2017-10-27 2017-11-11 2018-06-16 2018-06-26 2018-07-04 2018-07-30 2018-08-20 2018-11-24 2018-12-21 2019-01-01 2019-05-09 2019-06-02